{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 公众号：数据不吹牛\n",
    "### 更多案例和有趣分析等你来撩"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os\n",
    "import json\n",
    "import requests\n",
    "import time\n",
    "import random\n",
    "from lxml import etree"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 设置基本网址和headers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "#两个问题基础网址\n",
    "gangwei = 'https://www.zhihu.com/api/v4/questions/266817891/answers?include=data%5B%2A%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_labeled%2Cis_recognized%2Cpaid_info%2Cpaid_info_content%3Bdata%5B%2A%5D.mark_infos%5B%2A%5D.url%3Bdata%5B%2A%5D.author.follower_count%2Cbadge%5B%2A%5D.topics&offset={}&limit={}&platform=desktop&sort_by=default'\n",
    "sanwen = 'https://www.zhihu.com/api/v4/questions/27329739/answers?include=data%5B%2A%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_labeled%2Cis_recognized%2Cpaid_info%2Cpaid_info_content%3Bdata%5B%2A%5D.mark_infos%5B%2A%5D.url%3Bdata%5B%2A%5D.author.follower_count%2Cbadge%5B%2A%5D.topics&offset={}&limit={}&platform=desktop&sort_by=default'\n",
    "\n",
    "headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "## 解析单页信息"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "def parse_page(url,headers):\n",
    "    html  = requests.get(url,headers = headers)\n",
    "    bs = json.loads(html.text)\n",
    "    result = pd.DataFrame()\n",
    "    for i in bs['data']:\n",
    "        headline = i['author']['headline'] #签名\n",
    "        gender = i['author']['gender']  #性别\n",
    "        user_type =  i['author']['user_type']\n",
    "        user_id =  i['author']['id']\n",
    "        user_token = i['author']['url_token']\n",
    "        follwer_count = i['author']['follower_count'] #关注人数\n",
    "        name = i['author']['name']   #用户昵称\n",
    "        vote_up = i['voteup_count']  #点赞数\n",
    "        updated_time = i['updated_time']    #更新时间\n",
    "        title = i['question']['title']   #问题\n",
    "        created_time = i['created_time'] #创建时间\n",
    "        comment_count = i['comment_count'] #评论数\n",
    "        can_comment = i['can_comment']['status']   #是否可以评论\n",
    "        content = i['content']  #内容，还需要再清洗\n",
    "        cache = pd.DataFrame({'用户ID':[user_id],'用户名':[name],'性别':[gender],'token':[user_token],'用户类型':[user_type],'签名':[headline],\n",
    "                              '被关注人数':[follwer_count],'创建时间':[created_time],'更新时间':[updated_time],'评论数':[comment_count],\n",
    "                              '点赞数':[vote_up],'是否可以评论':[can_comment],'内容':[content],'问题':[title]})\n",
    "        result = pd.concat([result,cache])\n",
    "    return result"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 设置爬取回答数，批量获取"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "def run_all(url,headers,num = 200):\n",
    "    final_result = pd.DataFrame()\n",
    "    for i in range(0,num,5):\n",
    "        try:\n",
    "            result = parse_page(url.format(i,5),headers)\n",
    "            final_result = pd.concat([final_result,result])\n",
    "            time.sleep(random.random())\n",
    "            print('i had parsed:',i)\n",
    "        except:\n",
    "            try:\n",
    "                time.sleep(5)\n",
    "                result = parse_page(url.format(i,5),headers)\n",
    "                final_result = pd.concat([final_result,result])\n",
    "                time.sleep(random.random())\n",
    "                print('i had parsed:',i)\n",
    "            except:\n",
    "                print(i,'is wrong~~~')  \n",
    "    return final_result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "i had parsed: 0\n",
      "i had parsed: 5\n",
      "i had parsed: 10\n",
      "i had parsed: 15\n",
      "i had parsed: 20\n",
      "i had parsed: 25\n",
      "i had parsed: 30\n",
      "i had parsed: 35\n",
      "i had parsed: 40\n",
      "i had parsed: 45\n",
      "i had parsed: 50\n",
      "i had parsed: 55\n",
      "i had parsed: 60\n",
      "i had parsed: 65\n",
      "i had parsed: 70\n",
      "i had parsed: 75\n",
      "i had parsed: 80\n",
      "i had parsed: 85\n",
      "i had parsed: 90\n",
      "i had parsed: 95\n",
      "i had parsed: 100\n",
      "i had parsed: 105\n",
      "i had parsed: 110\n",
      "i had parsed: 115\n",
      "i had parsed: 120\n",
      "i had parsed: 125\n",
      "i had parsed: 130\n",
      "i had parsed: 135\n",
      "i had parsed: 140\n",
      "i had parsed: 145\n",
      "i had parsed: 150\n",
      "i had parsed: 155\n",
      "i had parsed: 160\n",
      "i had parsed: 165\n",
      "i had parsed: 170\n",
      "i had parsed: 175\n",
      "i had parsed: 180\n",
      "i had parsed: 185\n",
      "i had parsed: 190\n",
      "i had parsed: 195\n",
      "i had parsed: 200\n",
      "i had parsed: 205\n",
      "i had parsed: 210\n",
      "i had parsed: 215\n",
      "i had parsed: 220\n",
      "i had parsed: 225\n",
      "i had parsed: 230\n",
      "i had parsed: 235\n",
      "i had parsed: 240\n",
      "i had parsed: 245\n",
      "i had parsed: 250\n",
      "i had parsed: 255\n",
      "i had parsed: 260\n",
      "i had parsed: 265\n",
      "i had parsed: 270\n",
      "i had parsed: 275\n",
      "i had parsed: 280\n",
      "i had parsed: 285\n",
      "i had parsed: 290\n",
      "i had parsed: 295\n",
      "i had parsed: 300\n",
      "i had parsed: 305\n",
      "i had parsed: 310\n",
      "i had parsed: 315\n",
      "i had parsed: 320\n",
      "i had parsed: 325\n",
      "i had parsed: 330\n",
      "i had parsed: 335\n",
      "i had parsed: 340\n",
      "i had parsed: 345\n",
      "i had parsed: 350\n",
      "i had parsed: 355\n",
      "i had parsed: 360\n",
      "i had parsed: 365\n",
      "i had parsed: 370\n",
      "i had parsed: 375\n",
      "i had parsed: 380\n",
      "i had parsed: 385\n",
      "i had parsed: 390\n",
      "i had parsed: 395\n",
      "i had parsed: 400\n",
      "i had parsed: 405\n",
      "i had parsed: 410\n",
      "i had parsed: 415\n",
      "i had parsed: 420\n",
      "i had parsed: 425\n",
      "i had parsed: 430\n",
      "i had parsed: 435\n",
      "i had parsed: 440\n",
      "i had parsed: 445\n",
      "i had parsed: 450\n",
      "i had parsed: 455\n",
      "i had parsed: 460\n",
      "i had parsed: 465\n",
      "i had parsed: 470\n",
      "i had parsed: 475\n",
      "i had parsed: 480\n",
      "i had parsed: 485\n",
      "i had parsed: 490\n",
      "i had parsed: 495\n",
      "i had parsed: 500\n",
      "i had parsed: 505\n",
      "i had parsed: 510\n",
      "i had parsed: 515\n",
      "i had parsed: 520\n",
      "i had parsed: 525\n",
      "i had parsed: 530\n",
      "i had parsed: 535\n",
      "i had parsed: 540\n",
      "i had parsed: 545\n",
      "i had parsed: 550\n",
      "i had parsed: 555\n",
      "i had parsed: 560\n",
      "i had parsed: 565\n",
      "i had parsed: 570\n",
      "i had parsed: 575\n",
      "i had parsed: 580\n",
      "i had parsed: 585\n",
      "i had parsed: 590\n",
      "i had parsed: 595\n",
      "i had parsed: 600\n",
      "i had parsed: 605\n",
      "i had parsed: 610\n",
      "i had parsed: 615\n",
      "i had parsed: 620\n",
      "i had parsed: 625\n",
      "i had parsed: 630\n",
      "i had parsed: 635\n",
      "i had parsed: 640\n",
      "i had parsed: 645\n",
      "i had parsed: 650\n",
      "i had parsed: 655\n",
      "i had parsed: 660\n",
      "i had parsed: 665\n",
      "i had parsed: 670\n",
      "i had parsed: 675\n",
      "i had parsed: 680\n",
      "i had parsed: 685\n",
      "i had parsed: 690\n",
      "i had parsed: 695\n",
      "i had parsed: 700\n",
      "i had parsed: 705\n",
      "i had parsed: 710\n",
      "i had parsed: 715\n",
      "i had parsed: 720\n",
      "i had parsed: 725\n",
      "i had parsed: 730\n",
      "i had parsed: 735\n",
      "i had parsed: 740\n",
      "i had parsed: 745\n",
      "i had parsed: 750\n",
      "i had parsed: 755\n",
      "i had parsed: 760\n",
      "i had parsed: 765\n",
      "i had parsed: 770\n"
     ]
    }
   ],
   "source": [
    "final_result = run_all(sanwen,headers,775)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 单个爬取用户信息，会存在ip短暂被ban的情况，如果想要持续稳定，建议准备好IP"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_ips():\n",
    "    #交给你了朋友，事先准备好ip，每次调用一个回来就OK"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'https': 'https://117.57.35.166:4512'}"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "get_ips()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 逐个爬取用户信息，获取行业岗位等信息"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_user_info(user,headers,ip):\n",
    "    user_url = 'https://www.zhihu.com/people/{}/activities'\n",
    "    ht = requests.get(user_url.format(user),headers = headers,proxies = ip)\n",
    "    if ht.text.find('安全验证') == -1:\n",
    "        \n",
    "        bs = etree.HTML(ht.text)\n",
    "        try:\n",
    "            hangye = bs.xpath('//div[@class = \"ProfileHeader-infoItem\"][1]/text()')\n",
    "        except:\n",
    "            hangye = None\n",
    "        try:\n",
    "            school = bs.xpath('//div[@class = \"ProfileHeader-infoItem\"][2]/text()')[0]\n",
    "        except:\n",
    "            school = None\n",
    "        try:\n",
    "            prof = bs.xpath('//div[@class = \"ProfileHeader-infoItem\"][2]/text()')[1]\n",
    "        except:\n",
    "            prof = None\n",
    "        df = pd.DataFrame({'token':[user],'行业':[hangye],'教育经历':[school],'专业':[prof]})\n",
    "        \n",
    "    else:\n",
    "        \n",
    "        ip = get_ips()\n",
    "        try:\n",
    "            ht = requests.get(user_url.format(user),headers = headers,proxies = ip)\n",
    "        except:\n",
    "            ip = get_ips()\n",
    "            ht = requests.get(user_url.format(user),headers = headers,proxies = ip)\n",
    "            bs = etree.HTML(ht.text)\n",
    "        try:\n",
    "            hangye = bs.xpath('//div[@class = \"ProfileHeader-infoItem\"][1]/text()')\n",
    "        except:\n",
    "            hangye = None\n",
    "        try:\n",
    "            school = bs.xpath('//div[@class = \"ProfileHeader-infoItem\"][2]/text()')[0]\n",
    "        except:\n",
    "            school = None\n",
    "        try:\n",
    "            prof = bs.xpath('//div[@class = \"ProfileHeader-infoItem\"][2]/text()')[1]\n",
    "        except:\n",
    "            prof = None\n",
    "        df = pd.DataFrame({'token':[user],'行业':[hangye],'教育经历':[school],'专业':[prof]})\n",
    "        print('ip changes')\n",
    "        \n",
    "    return df,ip"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 循环爬取用户信息"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ip = get_ips()\n",
    "ct = 1\n",
    "user_info2 = pd.DataFrame()\n",
    "for i in final_result['token']:\n",
    "    df,ip = get_user_info(i,headers,ip)\n",
    "    user_info2 = pd.concat([user_info2,df])\n",
    "    time.sleep(random.random() / 2)\n",
    "    print('i had parsed:{}'.format(ct))\n",
    "    ct += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 128,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
